library(RColorBrewer)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.0
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(ggplot2)
library(readr)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(tidyverse)
library(stringr)
library(dplyr)
library(formattable)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:formattable':
##
## style
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
library(htmltools)
library(htmlwidgets)
bin148 <- read_csv("bin148_topfiftyhits.txt",
col_names = c("qseqid", "sseqid", "evalue", "staxids", "sscinames", "sskingdoms", "stitle"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 7768 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): qseqid, sseqid, staxids, sscinames, sskingdoms, stitle
## dbl (1): evalue
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bin148_eval <- bin148 %>%
filter(as.numeric(evalue) <= 1e-10)
bin148_kingdomcount <- bin148_eval %>%
count(sskingdoms)
bin148_kingdomcount
## # A tibble: 5 × 2
## sskingdoms n
## <chr> <int>
## 1 Archaea 117
## 2 Bacteria 1739
## 3 Eukaryota 1050
## 4 N/A 3
## 5 Viruses 1566
piechart_bin148 <- ggplot(bin148_kingdomcount, aes(x="", y=n, fill=sskingdoms)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
scale_fill_brewer(palette = "RdPu") +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
piechart_bin148
bin148_rmduplicates <- bin148_eval %>%
group_by(qseqid) %>%
distinct(sseqid, .keep_all = TRUE)
bin148_eukaryotes <- bin148_rmduplicates %>%
filter(sskingdoms == "Eukaryota") %>%
group_by(qseqid, sseqid, staxids) %>%
slice_min(order_by = evalue) %>%
ungroup() %>%
mutate(genus = word(sscinames, 1))
bin148_eukaryotes
## # A tibble: 714 × 8
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 Contig103910588_1 dbj|GFR915… 7.64e- 73 1093978 Elysia… Eukary… CAP-s… Elys…
## 2 Contig103910588_1 emb|CAF097… 1.77e- 39 104777 Brachi… Eukary… unnam… Brac…
## 3 Contig103910588_1 gb|KAG2378… 3.7 e- 40 51637 Naegle… Eukary… hypot… Naeg…
## 4 Contig103910588_1 gb|KAH3765… 1.45e- 42 1580589 Pelomy… Eukary… poly … Pelo…
## 5 Contig103910588_1 gb|KJE9221… 9.59e- 40 595528 Capsas… Eukary… hypot… Caps…
## 6 Contig103910588_1 ref|XP_004… 3.06e- 40 595528 Capsas… Eukary… hypot… Caps…
## 7 Contig103910588_1 ref|XP_013… 5.36e- 39 461836 Thecam… Eukary… hypot… Thec…
## 8 Contig103910588_1 ref|XP_044… 1.41e- 37 51637 Naegle… Eukary… uncha… Naeg…
## 9 Contig103910588_10 dbj|GFR915… 5.76e-148 1093978 Elysia… Eukary… ribon… Elys…
## 10 Contig103910588_10 emb|CAB337… 5.84e-103 197152 Cloeon… Eukary… Hypot… Cloe…
## # … with 704 more rows, and abbreviated variable names ¹sscinames, ²sskingdoms
# Count the number of times each genus appears for a specific protein sequence
bin148_egenus_counts <- bin148_eukaryotes %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 1) # Only include those with more than one count
print(bin148_egenus_counts)
## # A tibble: 102 × 3
## # Groups: qseqid, genus [102]
## qseqid genus count
## <chr> <chr> <int>
## 1 Contig103910588_1 Capsaspora 2
## 2 Contig103910588_1 Naegleria 2
## 3 Contig103910588_10 Danio 2
## 4 Contig103910588_10 Drosophila 18
## 5 Contig103910588_10 Penaeus 4
## 6 Contig103910588_10 Takifugu 2
## 7 Contig103910588_4 Rhizophagus 6
## 8 Contig103910588_5 Albugo 2
## 9 Contig103910588_5 Aphanomyces 18
## 10 Contig103910588_5 Peronosclerospora 2
## # … with 92 more rows
# Count the number of times each genus appears for a specific protein sequence
bin148_egenus_counts_3 <- bin148_eukaryotes %>%
group_by(qseqid, genus) %>%
summarise(count = n(), .groups = "keep") %>%
filter(count > 1) # Only include those with more than one count
print(bin148_egenus_counts_3)
## # A tibble: 102 × 3
## # Groups: qseqid, genus [102]
## qseqid genus count
## <chr> <chr> <int>
## 1 Contig103910588_1 Capsaspora 2
## 2 Contig103910588_1 Naegleria 2
## 3 Contig103910588_10 Danio 2
## 4 Contig103910588_10 Drosophila 18
## 5 Contig103910588_10 Penaeus 4
## 6 Contig103910588_10 Takifugu 2
## 7 Contig103910588_4 Rhizophagus 6
## 8 Contig103910588_5 Albugo 2
## 9 Contig103910588_5 Aphanomyces 18
## 10 Contig103910588_5 Peronosclerospora 2
## # … with 92 more rows
distinct_colors <- c("skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue", "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin", "skyblue1", "#E6F5D0", "#B8E186", "#7FBC41", "#4D9221", "#276419", "#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999", "#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C", "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00", "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928", "#FBB4AE", "#B3CDE3", "#CCEBC5", "#DECBE4", "#FED9A6", "midnightblue", "#E5D8BD", "cornflowerblue", "#8DD3C7", "royalblue", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "darkcyan", "#D9D9D9", "#BC80BD", "#CCEBC5", "chartreuse2", "#7FC97F", "#BEAED4", "purple", "cadetblue", "#386CB0", "#F0027F", "#BF5B17", "#666666", "#8E0152", "#C51B7D", "#DE77AE", "#F1B6DA", "#FDE0EF", "lightpink", "lightgreen", "mediumpurple1", "lightgoldenrod2", "lavender", "lavenderblush1", "lightblue2", "lightcyan1", "lemonchiffon2", "lawngreen", "khaki3", "ivory2", "indianred2", "hotpink2", "honeydew2", "mistyrose1", "midnightblue", "mediumvioletred", "mediumslateblue", "mediumpurple3", "mediumorchid1", "maroon2", "maroon", "limegreen", "lightsteelblue", "lightskyblue", "lightseagreen", "lightsalmon", "lightpink2", "pink", "peachpuff", "palevioletred1", "paleturquoise", "palegreen", "orchid1", "orange1", "olivedrab1", "purple2", "plum1", "steelblue1", "royalblue1", "skyblue", "coral1", "chartreuse2", "darkgreen", "deepskyblue", "cornflowerblue", "darkcyan", "moccasin")
bin148_eukaryotes_ggplot_genusFreq <- ggplot(bin148_egenus_counts_3, aes(x = genus, y = count)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
theme_minimal() +
theme(text = element_text(size = 20), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin148_eukaryotes_ggplot_genusFreq
bin148_eukaryotes_ggplot <- ggplot(bin148_egenus_counts_3, aes(x = qseqid, y = count, fill = genus)) +
geom_bar(stat = "identity", width = 0.7, position = "stack") +
scale_y_continuous(labels = function(y) str_wrap(y, width = 3)) +
theme_minimal() +
scale_fill_manual(values = distinct_colors) +
theme(text = element_text(size = 25), axis.text.x = element_text(angle = 90, hjust = 1), legend.position = "bottom")
bin148_eukaryotes_ggplot
#### Using ggsave to save .png formats of ggplots.
ggsave(“bin148_genusFreq_qseqid.png”, plot = bin148_eukaryotes_ggplot,
width = 20, height = 10)
ggplotly(bin148_eukaryotes_ggplot)
library(DT)
bin148_viruses <- bin148_rmduplicates %>%
filter(sskingdoms == "Viruses")
bin148_viruses
## # A tibble: 1,466 × 7
## # Groups: qseqid [60]
## qseqid sseqid evalue staxids sscin…¹ sskin…² stitle
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 Contig2728381_4 gb|WEG42285.1| 5.98e-53 10497 Africa… Viruses BA71V…
## 2 Contig2728381_4 sp|P0CAC2.1| 7.46e-53 10500 Africa… Viruses RecNa…
## 3 Contig2728381_4 gb|QRY19113.1| 9.61e-53 10497 Africa… Viruses BA71V…
## 4 Contig2728381_4 ref|YP_009702820.1| 1.24e-52 10497;56… Africa… Viruses BA71V…
## 5 Contig2728381_4 ref|YP_009702985.1| 1.7 e-52 10497;29… Africa… Viruses BA71V…
## 6 Contig2728381_4 ref|YP_009927211.1| 1.81e-52 10497 Africa… Viruses hypot…
## 7 Contig2728381_4 gb|QID21254.1| 2.67e-52 10497 Africa… Viruses pB263…
## 8 Contig2728381_4 gb|UNZ12432.1| 2.73e-52 10497 Africa… Viruses B263R…
## 9 Contig2728381_4 gb|UPH95742.1| 2.76e-52 10497 Africa… Viruses B263R…
## 10 Contig2728381_4 gb|QGM12922.2| 3.74e-52 10497 Africa… Viruses pB263…
## # … with 1,456 more rows, and abbreviated variable names ¹sscinames,
## # ²sskingdoms
datatable(bin148_viruses)